package edu.hust.WebMagicTest;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

public class CsdnPageProcessor implements PageProcessor {
	
	private Site site = Site.me().setRetrySleepTime(100).setRetryTimes(3);
	public static final String BLOG_URL = "http://blog.csdn.net/huanhuan_tiantian/article/details/\\d+";
//	public static final String LIST_URL = "http://write\\.blog\\.csdn\\.net/\\w*";
	
	public Site getSite() {
		return site;
	}

	public void process(Page page) {
//		page.putField("title", page.getHtml().xpath("//div[@class=\"article_title\"]/h1/span/a/text()"));
//		page.putField("time", page.getHtml().xpath("//div[@class=\"article_r\"]/span/text()"));
		
		//属于博客详细页面
 		if(page.getUrl().regex(BLOG_URL).match()){
			page.putField("title", page.getHtml().xpath("//span[@class=\"link_title\"]/a/text()"));
			if(page.getResultItems().get("title") == null || page.getResultItems().get("title").equals("")){
				page.setSkip(true);
			}
			page.putField("time", page.getHtml().xpath("//span[@class=\"link_postdate\"]/text()"));
			page.putField("category", page.getHtml().xpath("//div[@class=\"category_r\"]/label/span/text()").all());
			page.putField("content", page.getHtml().xpath("//div[@class=\"article_content\"]/tidyText()"));
		}else{
			page.addTargetRequests(page.getHtml().xpath("//span[@class=\"link_title\"]").links().regex(BLOG_URL).all());
		}
	}
}
